/*******************************************************************************
/* This file combines differnet firm headers files into one */
/* The first input file is rep.hqadd_expanded, which is extracted from 10-k header files. It 
covers from 1994 to 2016 (early 2017 for some firms)*/
/* The second input file is rep.headerhist, downloaded from crsp_compustat merged database on wrds. It covers
from 2007 to 2018 June*/
*******************************************************************************/

%let input1 = rep.hqadd_expanded; 
%let input2 = rep.headerhist; 


** Combining the two datasets become very inconvinient. 
* This is because that the crsp historical headquarter adddresses are using multiple sec fillings;
* As a result, the effecitve date ranges for each address is different than compared to the addresses parsed from 10-k. 
* Another difference is that the crsp addresses uses fidate (filing date) to determine the effective date ragens;
* Change the effective starting and ending dates for each address-cik pair makes more sense;

* The purpose of the following code is to combine two files and creat an indicator variable for whether the address
  comes from first, second, or both datasets;

* Consolidate the input 1 using filing date; 
* One draw back using proc sql is that it cannot handle switch back to the same address;
proc sort data = &input1 out = temp1; by cik fidate; run; 
* create group id for zip within each firm;
data temp2;
  set temp1; 
  by cik fidate; 
  if first.cik then zip_group =0;
  if first.cik then lag_zip = .;
  lag_zip = lag(ba_zip5);
  if ba_zip5 ne lag_zip then zip_group +1; 
  run; 
* Identify the earliest and latest filing date associated with each firm-zip pairs; 
proc sql;
  create table temp3
  as select distinct cik, min(fidate) as beg_date format date9., 
    max(fidate) as end_date format date9., zip_group,
    ba_city, ba_state, /*only need zip, city, and state */
    ba_zip5, latitude, longitude, zip5, dist
  from temp2 as a
  group by cik, zip_group;
  quit; 
  * There are duplicates, resulted from the same zip being associated with differnet city or streets;
  proc sort data = temp3; by cik zip_group beg_date descending end_date; run;
  data temp3; 
    set temp3;
    by cik zip_group beg_date; 
    if first.beg_date; run;  
* Fill in the gap: if firm change address between t and t+1, assume that the old address is effective until t+1; 
proc sort data = temp3; by cik descending zip_group; run;
data temp3; 
  set temp3;
  by cik descending zip_group; 
  lag_beg_date = lag(beg_date);
  format lag_beg_date date9.;
  if first.cik then lag_beg_date = .;
  if first.cik = 0 then end_date =  intnx('day',lag_beg_date, -1);
  *If a cik's last distinct address is only identified in one filings, then its beg_date = end_date; 
    * In this case, assume it is valid until year end; 
  if first.cik and beg_date = end_date then end_date = intnx('year', end_date, 0, 'E');
  drop lag_beg_date; 
  run; 
proc sort data = temp3; by cik zip_group; run; 

* Input2 (crsp) files are already in very nice format;
* Just need to consolidate to firm-zip pairs; 
proc sort data = &input2; by hcik hchgdt; run; 
data temp4;
  set &input2;
  ba_zip5 = substr(haddzip, 1, 5);
  zip5 = input(ba_zip5, 5.);
  cik = hcik*1; 
  hcity = upcase(hcity); 
  rename hchgdt = beg_date hchgenddt = end_date hcity = ba_city hstate=ba_state;
/*  keep cik hchgdt hchgenddt hcity hstate ba_zip5 zip5; */
  if hcik ne .; 
  run; 
* create group id for zip within each firm;
proc sort data = temp4; by cik beg_date; run; 
data temp5;
  set temp4; 
  by cik beg_date; 
  if first.cik then zip_group =0;
  if first.cik then lag_zip = .;
  lag_zip = lag(ba_zip5);
  if ba_zip5 ne lag_zip then zip_group +1; 
  run; 
* Identify the earliest and latest filing date associated with each firm-zip pairs; 
* First check countries of all headquarters ( there are 300+ missing values, all with missing address and city);
data missing_country;
  set temp5;
  if hloc = "";
  run; 
proc sql;
  create table temp6
  as select distinct cik, min(beg_date) as beg_date format date9., 
    max(end_date) as end_date format date9.,
    ba_city, ba_state, ba_zip5, zip5, zip_group
  from temp5(where = (hloc = "USA")) /*foucs on us address*/
  where (ba_city ne ' ' or ba_zip5 ne ' ')
  group by cik, zip_group; 
  quit; 
* Check duplicates;
  * It happens if a firm change city name but keep the same zip codes;
  * It also happens if the same st addresses are spelled in different ways;
  proc sort data = temp6 nodupkey dupout = check;
    by cik zip_group; 
    run; 

* Now, link each zip code with one lat and long coordinate from the 4606 addresses;
* Frist try to match directly with hqadd_expanded and then with rep.loc with minimum distance;

* There are duplicates in zip_geocoding pairs in rep.hqadd_expanded;
proc sql;   
  create table distinct_zips
  as select distinct ba_zip5, latitude, longitude, dist 
  from rep.hqadd_expanded; 
  quit; 

proc sort data = distinct_zips nodupkey dupout = check out = distinct_zip2; 
  by ba_zip5;
  run; 
* There are 21 dups from check;
* To remove the dups, sort on zip5 and dist. Keep only the first.zip5;
proc sort data = distinct_zips; 
  by ba_zip5 dist;
  run;
data distinct_zips; 
  set distinct_zips;
  by ba_zip5 dist;
  if first.ba_zip5;
  run; * Now distinct_zips contains 6803 distinct obs;

* Match crsp address with geocodes by lookup ba_zip5 in distinct_zips;
proc sql;
  create table temp7
  as select distinct a.*, b.longitude, b.latitude, b.dist
    from temp6 as a left join distinct_zips as b
  on a.ba_zip5 = b.ba_zip5;
  quit; * This leaves the temp7 with the same obs as with temp6 (36988);
/**************************************************************************
  * There are obs missing zips;
  data number_missing_zips; * 995 obs; 
    set temp7; 
    if ba_zip5 = " "; 
    run; 
  data number_missing_zipsandcity; * 0 obs; 
    set temp7;
    if ba_zip5 = " " and ba_city = " ";
    run; 
  data number_missing_city; * 63 obs;
    set temp7;
    if ba_city = " ";
    run; 
  * There are 13945 obs not matched using zips; 
  data number_unmatched;
    set temp7;
    if latitude = .;
    run;
**************************************************************************/
* THere are 2572 obs with valid numerical zips but unmatched;
data unmatched_with_zip;
  set temp7;
  if latitude = . and zip5 ne .;
  run;
* Match directly by comparing distances with 4604 locations; 
proc sql;
  * use sashelp.zipcode to geocode zip for those not even matched to rep.hqadd;
  create table temp_linked2
  as select distinct a.*, b.X, b.Y
  from unmatched_with_zip as a left join sashelp.zipcode as b
  on a.zip5 = b.zip;
/*or (strip(upcase(a.ba_state)) = strip(upcase(b.statecode)) and  strip(upcase(a.ba_city_1)) = strip(upcase(b.city)));*/
  quit;
* nearest loc;
proc sql;
  create table zip_geo
  as select distinct latitude, longitude
  from rep.loc;

  * having the smallest distance;
  create table temp_linked3
  as select distinct a.*, b.latitude, 
             b.longitude,
             geodist(a.Y, a.X, b.latitude, b.longitude) as dist
             /* default distances are in kilometers */
  from temp_linked2 (drop = latitude longitude dist) as a left join zip_geo as b
  on a.Y ne .
  group by a.cik, a.beg_date, a.ba_zip5
  having calculated dist = min(calculated dist);
  quit; 

* Now append this back to the original files; 
data temp8;
  set temp7 (where = (latitude ne . or zip5 = .))
    temp_linked3 (drop = x y); 
  run; 
** Check missing values, 211 obs missing latitude; 
  data still_missing_geo;
      set temp8;
      if latitude = .;
      run;
  * Only less than 1% are not geocoded;

/***********************************************************************/
/***************** Combining two resources *****************************/
/***********************************************************************/

* Create source variables;
* And keep character variables the same length;
  data temp3;
    set temp3;
    source = 1;
    format 
         ba_city $104.
         ba_state $12.
         ba_zip5 $10.;
    run; 
  data temp8;
    set temp8; 
    source = 2;
    format 
         ba_city $104.
         ba_state $12.
         ba_zip5 $10.;
    run; 

** Characterize diferent linking types;
  * Type 1 -- a cik appeared on both the source files
        then use input1 for pre-2007 and input2 for post-2007;
  * Type 2 -- a cik only appeared on input1;
  * Type 3a -- a cik only appeared on input2 but first.date is after 2007;
  * Type 3b -- a cik only apppeare on input2 but first.date is in 2007;
  
  ** Identify the types;
  data combined1;
    length 
          ba_city $ 104
          ba_state $ 12
          ba_zip5 $ 24;
    set temp3 temp8;
    run; 

  proc sql;
    create table cik_type
    as select distinct cik, sum(source) as category
    from (select distinct cik, source from combined1) 
    group by cik;
    quit; 
  proc sort data = cik_type; by category; run; 
    * Only has 1-3 values for category; 

  *** Rename the type;
  data cik_type;
    set cik_type;
    if category = 1 then type = 2;
    if category = 2 then type = 3; 
    if category = 3 then type = 1; 
    run; 
  proc sort data = cik_type; by cik type; run; 
  ** There are 38% type1, 38% type2, and 24% type3 of all 43218 ciks; 
  proc freq data = cik_type;
    table type;
    run; 
  *** Merge back cik types;
  proc sql;
    create table type
    as select distinct a.*, b.type
    from combined1 as a, cik_type as b
    where a.cik = b.cik
    order by a.cik, a.source, a.beg_date;
    quit;  
  * identity the first beg_date for each cik from input2;
  proc sort data = type; by cik descending source beg_date; run;
  * Type 1;
  data type;
    set type;
    by cik;
    retain lala;
    if first.cik then 
      do; 
        fbeg = beg_date;
        lala = beg_date;
      end;
      else fbeg = lala;
    drop lala; 
    format fbeg date9.;
    run;
  data type1;
    set type(where = (type = 1));
    if beg_date >= fbeg and source = 1 then delete;
    if latitude ne .;
    run; 
  proc sort data = type1; by cik descending beg_date; run; 
  data type1;
    set type1;
    by cik;
    lag_beg_date = lag(beg_date);
    if first.cik then lag_beg_date = .;
    if first.cik = 0 then end_date =  intnx('day',lag_beg_date, -1);
    drop lag_beg_date;
    run ;
  * Type 2; 
  data type2;
    set type (where = (type = 2));
    if latitude ne .;
    run;
  * Type 3;
  data type3;
    set type (where = (type=3));
    if year(fbeg)>2007 then type = 3.1; *4519 obs;
    if year(fbeg)=2007 then type = 3.2; *9839 obs; 
    if latitude ne .;
    run; 
/*  proc freq data = type3;*/
/*    table type;*/
/*    run; */

  *** Merge type1-type3;
  data alltypes;
    set type1-type3;
    run; 


  *** Match with county fips;
  proc sql;
    create table alltypes1
    as select distinct a.*, 1000*b.state+b.county as county
    from alltypes as a left join sashelp.zipcode as b
    on a.zip5 = b.zip;
    quit;  
  * Check unmatched;
    data check; set alltypes1; if county = . ; run; 
    data chek1; set sashelp.zipcode; if zip = 43308; run ;
    data check2; set temp3; if zip5 = 92613; run; 
    **** Use city name to match county;
    proc sql;
      create table alltypes2
      as select distinct a.*, 1000*b.state+b.county as county2
      from alltypes1 as a left join sashelp.zipcode as b
      on a.county = . and (strip(upcase(a.ba_state)) = strip(upcase(b.statecode)) 
        and  strip(upcase(a.ba_city)) = strip(upcase(b.city)));
      quit; 
    data check3; set alltypes2; if county = . and county2 = . ; run; * only 26 obs are left; 
  proc sort data = alltypes2; by cik beg_date; run ;
  * final cleaning;  
  data alltypes3;
    set alltypes2;
    if county =. then county = county2;
    drop county2; 
    run ;
    * It also happens that some city names are actually community names associated with two counties; 
    * Choose only one county for the purpose of the matching; 
    proc sort data = alltypes3 nodupkey out = alltypes4 dupout = check4; by cik beg_date; run; 


  ** Create no_hqadd_chg and urban dummy;
  %let dist_cutoff = 20; * the cutoff distance for address changes to be considered relocation; 
  data alltypes5;
    set alltypes4(drop = zip_group);
    by cik beg_date;
    laglat = lag(latitude);
    laglon = lag(longitude);
    move_dist = geodist(latitude, longitude, laglat, laglon);
    lag_county = lag(county);
    move1 = 0;
    move2 = 0;
    if first.cik then 
      do;
        laglat = .;
        laglon = .;
        move_dist = .; 
        lag_county = .;
        zip_group = 1;
        county_group = 0; 
      end;
    
    * two move dummies: one is move_dist >0 and the other is when county changes;
    if move_dist > &dist_cutoff then 
      do; 
        move1 = 1;
        zip_group +1;  
      end;
    if county ne lag_county then 
      do; 
        move2 = 1; 
        county_group +1; 
      end;
    if first.cik then 
      do;
        move1 = 0;
        move2 = 0; 
      end;
    run; 
  *** Number of changes; 
  proc sql;
    create table alltypes6
    as select distinct *, 
       sum(move1) as no_hqadd_zipchg, sum(move2) as no_hqadd_ctychg
    from alltypes5
    group by cik
    order by cik, beg_date;
    quit; 
  *** Urban dummies;
  * create rural vs urban indicator variables;
  * Follow Geographic Dissemination of Information, Journal of Coporate Finance, Loughran (2007);
  * Urban stocks are defined as New York City, Los Angeles, Chicago, Washington, San Francisco,
  * Philadelphia, Boston, Detroit, Dallas, or Houston;
  data alltypes6;
    set alltypes6;
    urban = 0; 
    if strip(upcase(ba_city)) in ('NEW YORK', 'LOS ANGELES', 'CHICAGO', 'WASHINGTON', 'SAN FRANCISCO', 
    'PHILADELPHIA', 'BOSTON', 'DETROIT', 'DELLAS', 'HOUSTON')
    then urban = 1; 
    run;
    *** Some stats on movers;
    proc sql;
      create table movers
      as select distinct cik, no_hqadd_zipchg, no_hqadd_ctychg
      from alltypes6
      order by cik;
      quit; 
    data check7;  set alltypes6; if latitude = . ; run ;
    **** it has zero obs of missing latitude or longitude; 
* For type = 2, extend its effective date to Sep, 2018;
    proc sort data = alltypes6; by cik beg_date; run ;
    data alltypes7;
      set alltypes6;
      by cik beg_date;
      if last.cik and type = 2 and year(end_date) in (2015, 2016) 
        then end_date = '01Sep2018'd; 
      run; 
* save the data;
  data rep.hqadd_combined_2018;
    set alltypes7;
    run; 
proc freq data = rep.hqadd_combined_2018;
  table type;
  run; 
* Combining CRSP files only increases 6 percent of new addresses; 
/*type Frequency Percent Cumulative Frequency Cumulative Percent */
/*1    43885 57.68 43885 57.68 */
/*2    18286 24.04 62171 81.72 */
/*3.1   4519  5.94 66690 87.66 */
/*3.2   9389 12.34 76079 100.00 */



